Data

  • the quantified peak intensities data from ABRF 2015, processed by Skyline.
  • The processed data, quant.skyline.rda, which is output of dataProcess from section 2

1. Protein quantification from the output of dataProcess in MSstats

1.1 Read the output of dataProcess

load('./data_day3/quant.skyline.rda')

It should use the output of quantification function in MSstats in order to get subject quantification for each protein. In this ABRF study, there is no biological replicate. Let’s use run-level quantification for protein instead as an example.

head(quant.skyline$RunlevelData)
##   RUN              Protein LogIntensities NumMeasuredFeature
## 1   1 sp|D6VTK4|STE2_YEAST       26.81232                  8
## 2   2 sp|D6VTK4|STE2_YEAST       26.60786                  8
## 3   3 sp|D6VTK4|STE2_YEAST       26.58301                  8
## 4   4 sp|D6VTK4|STE2_YEAST       26.83563                  8
## 5   5 sp|D6VTK4|STE2_YEAST       26.79430                  8
## 6   6 sp|D6VTK4|STE2_YEAST       26.60863                  8
##   MissingPercentage more50missing NumImputedFeature
## 1                 0         FALSE                 0
## 2                 0         FALSE                 0
## 3                 0         FALSE                 0
## 4                 0         FALSE                 0
## 5                 0         FALSE                 0
## 6                 0         FALSE                 0
##                 originalRUN GROUP GROUP_ORIGINAL SUBJECT_ORIGINAL
## 1 JD_06232014_sample1_B.raw     1     Condition1                1
## 2 JD_06232014_sample1_C.raw     1     Condition1                1
## 3 JD_06232014_sample1-A.raw     1     Condition1                1
## 4 JD_06232014_sample2_A.raw     2     Condition2                2
## 5 JD_06232014_sample2_B.raw     2     Condition2                2
## 6 JD_06232014_sample2_C.raw     2     Condition2                2
##   SUBJECT_NESTED SUBJECT
## 1            1.1       1
## 2            1.1       1
## 3            1.1       1
## 4            2.2       2
## 5            2.2       2
## 6            2.2       2
runquant <- quant.skyline$RunlevelData
head(runquant)
##   RUN              Protein LogIntensities NumMeasuredFeature
## 1   1 sp|D6VTK4|STE2_YEAST       26.81232                  8
## 2   2 sp|D6VTK4|STE2_YEAST       26.60786                  8
## 3   3 sp|D6VTK4|STE2_YEAST       26.58301                  8
## 4   4 sp|D6VTK4|STE2_YEAST       26.83563                  8
## 5   5 sp|D6VTK4|STE2_YEAST       26.79430                  8
## 6   6 sp|D6VTK4|STE2_YEAST       26.60863                  8
##   MissingPercentage more50missing NumImputedFeature
## 1                 0         FALSE                 0
## 2                 0         FALSE                 0
## 3                 0         FALSE                 0
## 4                 0         FALSE                 0
## 5                 0         FALSE                 0
## 6                 0         FALSE                 0
##                 originalRUN GROUP GROUP_ORIGINAL SUBJECT_ORIGINAL
## 1 JD_06232014_sample1_B.raw     1     Condition1                1
## 2 JD_06232014_sample1_C.raw     1     Condition1                1
## 3 JD_06232014_sample1-A.raw     1     Condition1                1
## 4 JD_06232014_sample2_A.raw     2     Condition2                2
## 5 JD_06232014_sample2_B.raw     2     Condition2                2
## 6 JD_06232014_sample2_C.raw     2     Condition2                2
##   SUBJECT_NESTED SUBJECT
## 1            1.1       1
## 2            1.1       1
## 3            1.1       1
## 4            2.2       2
## 5            2.2       2
## 6            2.2       2

1.2 Reformat to wide format

Then let’s make wide format of data from long format.

library(reshape2)

input.wide <- dcast(Protein ~ originalRUN, data=runquant, value.var = 'LogIntensities')
head(input.wide)
##                 Protein JD_06232014_sample1_B.raw
## 1  sp|D6VTK4|STE2_YEAST                  26.81232
## 2  sp|O13297|CET1_YEAST                  24.71912
## 3  sp|O13329|FOB1_YEAST                  23.37678
## 4  sp|O13539|THP2_YEAST                  27.52021
## 5 sp|O13547|CCW14_YEAST                  27.22234
## 6 sp|O13563|RPN13_YEAST                  26.09476
##   JD_06232014_sample1_C.raw JD_06232014_sample1-A.raw
## 1                  26.60786                  26.58301
## 2                  24.67437                  24.71809
## 3                  24.01464                  23.47075
## 4                  27.38510                  24.29661
## 5                  26.80454                  27.11638
## 6                  26.29253                  26.17056
##   JD_06232014_sample2_A.raw JD_06232014_sample2_B.raw
## 1                  26.83563                  26.79430
## 2                  24.52341                  24.67593
## 3                  23.17165                  23.43427
## 4                  25.88066                  26.10254
## 5                  27.17386                  26.91302
## 6                  25.90882                  26.12989
##   JD_06232014_sample2_C.raw JD_06232014_sample3_A.raw
## 1                  26.60863                  26.62966
## 2                  24.57865                  24.38955
## 3                  23.76997                  24.20100
## 4                  25.90646                  25.65002
## 5                  26.88314                  26.56416
## 6                  26.01078                  26.17791
##   JD_06232014_sample3_B.raw JD_06232014_sample3_C.raw
## 1                  26.49626                  26.53029
## 2                  24.62652                  24.66197
## 3                  23.28643                  23.73741
## 4                  26.33853                  25.91799
## 5                  26.75541                  26.95027
## 6                  26.03753                  26.11412
##   JD_06232014_sample4_B.raw JD_06232014_sample4_C.raw
## 1                  26.60612                  26.38611
## 2                  24.64886                  24.66559
## 3                  23.16646                  23.60780
## 4                  26.70101                  25.91781
## 5                  26.98082                  26.80274
## 6                  26.07538                  26.05415
##   JD_06232014_sample4-A.raw
## 1                  26.65573
## 2                  24.50814
## 3                  23.03473
## 4                  25.07576
## 5                  27.07526
## 6                  25.77958
rownames(input.wide) <- input.wide$Protein
input.wide <- input.wide[, -1]
head(input.wide)
##                       JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST                   26.81232                  26.60786
## sp|O13297|CET1_YEAST                   24.71912                  24.67437
## sp|O13329|FOB1_YEAST                   23.37678                  24.01464
## sp|O13539|THP2_YEAST                   27.52021                  27.38510
## sp|O13547|CCW14_YEAST                  27.22234                  26.80454
## sp|O13563|RPN13_YEAST                  26.09476                  26.29253
##                       JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST                   26.58301                  26.83563
## sp|O13297|CET1_YEAST                   24.71809                  24.52341
## sp|O13329|FOB1_YEAST                   23.47075                  23.17165
## sp|O13539|THP2_YEAST                   24.29661                  25.88066
## sp|O13547|CCW14_YEAST                  27.11638                  27.17386
## sp|O13563|RPN13_YEAST                  26.17056                  25.90882
##                       JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST                   26.79430                  26.60863
## sp|O13297|CET1_YEAST                   24.67593                  24.57865
## sp|O13329|FOB1_YEAST                   23.43427                  23.76997
## sp|O13539|THP2_YEAST                   26.10254                  25.90646
## sp|O13547|CCW14_YEAST                  26.91302                  26.88314
## sp|O13563|RPN13_YEAST                  26.12989                  26.01078
##                       JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST                   26.62966                  26.49626
## sp|O13297|CET1_YEAST                   24.38955                  24.62652
## sp|O13329|FOB1_YEAST                   24.20100                  23.28643
## sp|O13539|THP2_YEAST                   25.65002                  26.33853
## sp|O13547|CCW14_YEAST                  26.56416                  26.75541
## sp|O13563|RPN13_YEAST                  26.17791                  26.03753
##                       JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST                   26.53029                  26.60612
## sp|O13297|CET1_YEAST                   24.66197                  24.64886
## sp|O13329|FOB1_YEAST                   23.73741                  23.16646
## sp|O13539|THP2_YEAST                   25.91799                  26.70101
## sp|O13547|CCW14_YEAST                  26.95027                  26.98082
## sp|O13563|RPN13_YEAST                  26.11412                  26.07538
##                       JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST                   26.38611                  26.65573
## sp|O13297|CET1_YEAST                   24.66559                  24.50814
## sp|O13329|FOB1_YEAST                   23.60780                  23.03473
## sp|O13539|THP2_YEAST                   25.91781                  25.07576
## sp|O13547|CCW14_YEAST                  26.80274                  27.07526
## sp|O13563|RPN13_YEAST                  26.05415                  25.77958
dim(input.wide) # there are 3027 rows (one row per protein) and 12 columns (each colomn for run)
## [1] 3027   12

Please check whether there are missing values (NAs) or not. If there is no intensity at all in certain run for certain protein, we can’t get run-summarized for that run.

sum(is.na(input.wide))
## [1] 3

There are three NAs. Then, we need to decide how to deal with NAs. * First option: remove the samples with missing values * Second option: impute the missing values. Then need to how to impute. 1) impute with zero, 2) with minimum value per protein or among all proteins, 3) with median or mean, 4) random sampling among other run-summarized value per protein.

In this case, let’s remove proteins with any NAs.

samplemissing <- apply(input.wide, 1, function(x) sum(is.na(x)))
unique(samplemissing)
## [1] 0 1 2
selectedSamples <- which(samplemissing == 0) 
# Only keep the samples with no missing values
input.wide <- input.wide[selectedSamples, ]
dim(input.wide)
## [1] 3025   12

Two samples, which have any NAs, are removed.

Let’s make another data with only spike-in proteins.

input.wide.spike <- input.wide[rownames(input.wide) %in% c("sp|P44015|VAC2_YEAST",
                                            "sp|P55752|ISCB_YEAST",
                                            "sp|P44374|SFG2_YEAST",
                                            "sp|P44983|UTR6_YEAST",
                                            "sp|P44683|PGA4_YEAST",
                                            "sp|P55249|ZRT4_YEAST"),]

input.wide.spike
##                      JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST                  26.30163                  26.11643
## sp|P44374|SFG2_YEAST                  24.26559                  24.27609
## sp|P44683|PGA4_YEAST                  22.47239                  22.56104
## sp|P44983|UTR6_YEAST                  21.29173                  21.38781
## sp|P55249|ZRT4_YEAST                  21.86421                  21.41435
## sp|P55752|ISCB_YEAST                  27.47412                  27.69489
##                      JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST                  26.29089                  25.81957
## sp|P44374|SFG2_YEAST                  24.23382                  21.36784
## sp|P44683|PGA4_YEAST                  22.52561                  20.06196
## sp|P44983|UTR6_YEAST                  21.31825                  27.51134
## sp|P55249|ZRT4_YEAST                  21.18725                  27.93246
## sp|P55752|ISCB_YEAST                  27.34912                  24.59934
##                      JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST                  26.11527                  26.08498
## sp|P44374|SFG2_YEAST                  21.20330                  21.19631
## sp|P44683|PGA4_YEAST                  20.18183                  20.72974
## sp|P44983|UTR6_YEAST                  27.46338                  27.33788
## sp|P55249|ZRT4_YEAST                  27.75780                  27.83651
## sp|P55752|ISCB_YEAST                  24.62946                  24.70403
##                      JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST                  23.14806                  23.32465
## sp|P44374|SFG2_YEAST                  26.90000                  26.85056
## sp|P44683|PGA4_YEAST                  21.72482                  21.93912
## sp|P44983|UTR6_YEAST                  26.94886                  26.92480
## sp|P55249|ZRT4_YEAST                  21.04481                  21.00220
## sp|P55752|ISCB_YEAST                  21.37264                  20.27802
##                      JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST                  23.29555                  20.94536
## sp|P44374|SFG2_YEAST                  26.90190                  26.61875
## sp|P44683|PGA4_YEAST                  21.96229                  29.20591
## sp|P44983|UTR6_YEAST                  26.85774                  23.86204
## sp|P55249|ZRT4_YEAST                  21.02761                  20.71864
## sp|P55752|ISCB_YEAST                  20.44112                  27.36941
##                      JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST                  21.71424                  20.25209
## sp|P44374|SFG2_YEAST                  26.60961                  26.69464
## sp|P44683|PGA4_YEAST                  29.12515                  29.21482
## sp|P44983|UTR6_YEAST                  23.76070                  24.01492
## sp|P55249|ZRT4_YEAST                  20.69131                  20.37886
## sp|P55752|ISCB_YEAST                  27.53762                  27.34758

I will save two wide format data.

save(input.wide.spike, file = 'input.wide.spike.rda')
save(input.wide.spike, file = 'input.wide.spike.rda')

2. Perform principal components analysis (PCA) on the given data matrix

2.1 Only 6 spike-in proteins

2.1.1 PCA with prcomp function

First, let’s try PCA with 6 spike-in proteins only. Input has the row for proteins and the column for run (subject).

head(input.wide.spike)
##                      JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST                  26.30163                  26.11643
## sp|P44374|SFG2_YEAST                  24.26559                  24.27609
## sp|P44683|PGA4_YEAST                  22.47239                  22.56104
## sp|P44983|UTR6_YEAST                  21.29173                  21.38781
## sp|P55249|ZRT4_YEAST                  21.86421                  21.41435
## sp|P55752|ISCB_YEAST                  27.47412                  27.69489
##                      JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST                  26.29089                  25.81957
## sp|P44374|SFG2_YEAST                  24.23382                  21.36784
## sp|P44683|PGA4_YEAST                  22.52561                  20.06196
## sp|P44983|UTR6_YEAST                  21.31825                  27.51134
## sp|P55249|ZRT4_YEAST                  21.18725                  27.93246
## sp|P55752|ISCB_YEAST                  27.34912                  24.59934
##                      JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST                  26.11527                  26.08498
## sp|P44374|SFG2_YEAST                  21.20330                  21.19631
## sp|P44683|PGA4_YEAST                  20.18183                  20.72974
## sp|P44983|UTR6_YEAST                  27.46338                  27.33788
## sp|P55249|ZRT4_YEAST                  27.75780                  27.83651
## sp|P55752|ISCB_YEAST                  24.62946                  24.70403
##                      JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST                  23.14806                  23.32465
## sp|P44374|SFG2_YEAST                  26.90000                  26.85056
## sp|P44683|PGA4_YEAST                  21.72482                  21.93912
## sp|P44983|UTR6_YEAST                  26.94886                  26.92480
## sp|P55249|ZRT4_YEAST                  21.04481                  21.00220
## sp|P55752|ISCB_YEAST                  21.37264                  20.27802
##                      JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST                  23.29555                  20.94536
## sp|P44374|SFG2_YEAST                  26.90190                  26.61875
## sp|P44683|PGA4_YEAST                  21.96229                  29.20591
## sp|P44983|UTR6_YEAST                  26.85774                  23.86204
## sp|P55249|ZRT4_YEAST                  21.02761                  20.71864
## sp|P55752|ISCB_YEAST                  20.44112                  27.36941
##                      JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST                  21.71424                  20.25209
## sp|P44374|SFG2_YEAST                  26.60961                  26.69464
## sp|P44683|PGA4_YEAST                  29.12515                  29.21482
## sp|P44983|UTR6_YEAST                  23.76070                  24.01492
## sp|P55249|ZRT4_YEAST                  20.69131                  20.37886
## sp|P55752|ISCB_YEAST                  27.53762                  27.34758
pc <- prcomp(t(input.wide.spike))

# Inspect PCA object
summary(pc)
## Importance of components%s:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     5.3154 3.6198 2.5680 0.2862 0.16542 0.11815
## Proportion of Variance 0.5877 0.2726 0.1372 0.0017 0.00057 0.00029
## Cumulative Proportion  0.5877 0.8603 0.9974 0.9991 0.99971 1.00000
names(pc)
## [1] "sdev"     "rotation" "center"   "scale"    "x"

2.1.2 Check the proportion of explained variance

Let’s check the proportion of explained variance. The first component has the largest variance. In this case, we need 3 components to capture most of the variation.

percent_var <- pc$sdev^2/sum(pc$sdev^2)
barplot(percent_var, xlab="Principle component", ylab="% of variance")

cum_var <- cumsum(pc$sdev^2/sum(pc$sdev^2))
barplot(cum_var, xlab="Principle component", ylab="Cumulative % of variance" )

2.1.3 Visualization for PC1 vs PC2

Let’s visualize PC1 vs PC2 in scatterplot. ‘x’ include PC components for each subject.

head(pc$x)
##                                  PC1      PC2       PC3         PC4
## JD_06232014_sample1_B.raw  0.5376666 3.932191 -3.115395 -0.06275524
## JD_06232014_sample1_C.raw  0.9065490 3.889990 -3.162756  0.16757835
## JD_06232014_sample1-A.raw  0.8629331 3.689371 -3.445799 -0.06053656
## JD_06232014_sample2_A.raw -7.3135082 1.192973  2.265224  0.25481904
## JD_06232014_sample2_B.raw -7.2865015 1.354552  2.135451  0.01480192
## JD_06232014_sample2_C.raw -6.9270993 1.512729  2.423839 -0.27208541
##                                    PC5         PC6
## JD_06232014_sample1_B.raw -0.034407021  0.25849848
## JD_06232014_sample1_C.raw  0.060173234 -0.05855078
## JD_06232014_sample1-A.raw -0.071114248 -0.19237301
## JD_06232014_sample2_A.raw  0.003991858  0.14244137
## JD_06232014_sample2_B.raw  0.033699006 -0.10337342
## JD_06232014_sample2_C.raw -0.048023671 -0.05503116
library(ggplot2)

ggplot(aes(x=PC1, y=PC2), data=data.frame(pc$x))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

In order to distinguish group by colors or shape, create vector of condition labels variable called Condition. The order should be the same as column of input.

colnames(input.wide.spike)
##  [1] "JD_06232014_sample1_B.raw" "JD_06232014_sample1_C.raw"
##  [3] "JD_06232014_sample1-A.raw" "JD_06232014_sample2_A.raw"
##  [5] "JD_06232014_sample2_B.raw" "JD_06232014_sample2_C.raw"
##  [7] "JD_06232014_sample3_A.raw" "JD_06232014_sample3_B.raw"
##  [9] "JD_06232014_sample3_C.raw" "JD_06232014_sample4_B.raw"
## [11] "JD_06232014_sample4_C.raw" "JD_06232014_sample4-A.raw"
Condition <- c(rep("Condition1", 3), rep("Condition2", 3),
               rep("Condition3", 3), rep("Condition4", 3))
Condition
##  [1] "Condition1" "Condition1" "Condition1" "Condition2" "Condition2"
##  [6] "Condition2" "Condition3" "Condition3" "Condition3" "Condition4"
## [11] "Condition4" "Condition4"
# Create PC1 vs PC2 scatterplot with Condition colors
ggplot(aes(x=PC1, y=PC2, color=Group), data=data.frame(pc$x, Group=Condition))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

We can create PCA scatterplots with custom Condition labels and colors.

ggplot(aes(x=PC1, y=PC2, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

ggplot(aes(x=PC1, y=PC3, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

ggplot(aes(x=PC2, y=PC3, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

2.1.4 Check the PC1 loadings

Let’s create plot of PC1 loadings. ‘rotation’ show the Protein loading for each component.

head(pc$rotation)
##                             PC1        PC2        PC3          PC4
## sp|P44015|VAC2_YEAST -0.3271751  0.3008252 -0.3804194 -0.668544685
## sp|P44374|SFG2_YEAST  0.3536458 -0.3705173 -0.2020736  0.009423747
## sp|P44683|PGA4_YEAST  0.6157778  0.0665221  0.5172269 -0.581893078
## sp|P44983|UTR6_YEAST -0.2864101 -0.4558387  0.4875131  0.093951595
## sp|P55249|ZRT4_YEAST -0.4990234  0.2270087  0.5359809 -0.116096974
## sp|P55752|ISCB_YEAST  0.2401119  0.7130736  0.1482540  0.438239928
##                              PC5         PC6
## sp|P44015|VAC2_YEAST  0.43898820 -0.13445862
## sp|P44374|SFG2_YEAST  0.54923192  0.62854860
## sp|P44683|PGA4_YEAST -0.05136461 -0.08735535
## sp|P44983|UTR6_YEAST  0.53571361 -0.42035144
## sp|P55249|ZRT4_YEAST -0.04695331  0.62966981
## sp|P55752|ISCB_YEAST  0.46238691 -0.07769923
ggplot(aes(x=Protein, xend=Protein, y=0, yend=PC1), 
       data=data.frame(pc$rotation, Protein=rownames(pc$rotation)))+
  geom_segment()+
  labs(y="PC1 loading")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=90))

2.1.5 loadings vs p-values

Let’s compare to loadings to p-values. loadings close to zero mean less contribution. Many proteins with small loadings have significant p-values.

load('./data_day3/Skyline.result.rda')
head(Skyline.result)
##                  Protein Label      log2FC         SE     Tvalue DF
## 1   sp|D6VTK4|STE2_YEAST C2-C1  0.07846162 0.09648008  0.8132416  8
## 7   sp|O13297|CET1_YEAST C2-C1 -0.11119732 0.07749333 -1.4349278  8
## 13  sp|O13329|FOB1_YEAST C2-C1 -0.16209422 0.29090086 -0.5572146  8
## 19  sp|O13539|THP2_YEAST C2-C1 -0.43742107 0.82871351 -0.5278315  8
## 25 sp|O13547|CCW14_YEAST C2-C1 -0.05774773 0.14672405 -0.3935805  8
## 31 sp|O13563|RPN13_YEAST C2-C1 -0.16945187 0.09518591 -1.7802201  8
##       pvalue adj.pvalue issue MissingPercentage ImputationPercentage
## 1  0.4396100  0.9991155    NA                 0                    0
## 7  0.1892233  0.9991155    NA                 0                    0
## 13 0.5926243  0.9991155    NA                 0                    0
## 19 0.6119387  0.9991155    NA                 0                    0
## 25 0.7041724  0.9991155    NA                 0                    0
## 31 0.1129126  0.9991155    NA                 0                    0
Skyline.result.spikein <- Skyline.result[Skyline.result$Protein %in% c("sp|P44015|VAC2_YEAST",
                                            "sp|P55752|ISCB_YEAST",
                                            "sp|P44374|SFG2_YEAST",
                                            "sp|P44983|UTR6_YEAST",
                                            "sp|P44683|PGA4_YEAST",
                                            "sp|P55249|ZRT4_YEAST"), ]
Skyline.result.spikein
##                    Protein Label      log2FC         SE      Tvalue DF
## 10549 sp|P44015|VAC2_YEAST C2-C1 -0.22970949 0.31123051  -0.7380687  8
## 10555 sp|P44374|SFG2_YEAST C2-C1 -3.00268709 0.04643043 -64.6706761  8
## 10561 sp|P44683|PGA4_YEAST C2-C1 -2.19517053 0.15722789 -13.9617123  8
## 10567 sp|P44983|UTR6_YEAST C2-C1  6.10493811 0.06963413  87.6716342  8
## 13279 sp|P55249|ZRT4_YEAST C2-C1  6.35365502 0.16454664  38.6130957  8
## 13285 sp|P55752|ISCB_YEAST C2-C1 -2.86176857 0.25597097 -11.1800512  8
## 10550 sp|P44015|VAC2_YEAST C3-C1 -2.98022772 0.31123051  -9.5756284  8
## 10556 sp|P44374|SFG2_YEAST C3-C1  2.62564667 0.04643043  56.5501301  8
## 10562 sp|P44683|PGA4_YEAST C3-C1 -0.64427217 0.15722789  -4.0976965  8
## 10568 sp|P44983|UTR6_YEAST C3-C1  5.57787150 0.06963413  80.1025498  8
## 13280 sp|P55249|ZRT4_YEAST C3-C1 -0.46373228 0.16454664  -2.8182422  8
## 13286 sp|P55752|ISCB_YEAST C3-C1 -6.80878285 0.25597097 -26.5998242  8
## 10551 sp|P44015|VAC2_YEAST C4-C1 -5.26575535 0.31123051 -16.9191488  8
## 10557 sp|P44374|SFG2_YEAST C4-C1  2.38249477 0.04643043  51.3132216  8
## 10563 sp|P44683|PGA4_YEAST C4-C1  6.66227860 0.15722789  42.3733902  8
## 10569 sp|P44983|UTR6_YEAST C4-C1  2.54661911 0.06963413  36.5714206  8
## 13281 sp|P55249|ZRT4_YEAST C4-C1 -0.89233327 0.16454664  -5.4229809  8
## 13287 sp|P55752|ISCB_YEAST C4-C1 -0.08784197 0.25597097  -0.3431716  8
## 10552 sp|P44015|VAC2_YEAST C3-C2 -2.75051824 0.31123051  -8.8375597  8
## 10558 sp|P44374|SFG2_YEAST C3-C2  5.62833376 0.04643043 121.2208062  8
## 10564 sp|P44683|PGA4_YEAST C3-C2  1.55089836 0.15722789   9.8640158  8
## 10570 sp|P44983|UTR6_YEAST C3-C2 -0.52706661 0.06963413  -7.5690843  8
## 13282 sp|P55249|ZRT4_YEAST C3-C2 -6.81738730 0.16454664 -41.4313380  8
## 13288 sp|P55752|ISCB_YEAST C3-C2 -3.94701428 0.25597097 -15.4197730  8
## 10553 sp|P44015|VAC2_YEAST C4-C2 -5.03604587 0.31123051 -16.1810801  8
## 10559 sp|P44374|SFG2_YEAST C4-C2  5.38518186 0.04643043 115.9838977  8
## 10565 sp|P44683|PGA4_YEAST C4-C2  8.85744912 0.15722789  56.3351025  8
## 10571 sp|P44983|UTR6_YEAST C4-C2 -3.55831900 0.06963413 -51.1002136  8
## 13283 sp|P55249|ZRT4_YEAST C4-C2 -7.24598829 0.16454664 -44.0360766  8
## 13289 sp|P55752|ISCB_YEAST C4-C2  2.77392660 0.25597097  10.8368796  8
## 10554 sp|P44015|VAC2_YEAST C4-C3 -2.28552763 0.31123051  -7.3435204  8
## 10560 sp|P44374|SFG2_YEAST C4-C3 -0.24315190 0.04643043  -5.2369085  8
## 10566 sp|P44683|PGA4_YEAST C4-C3  7.30655076 0.15722789  46.4710867  8
## 10572 sp|P44983|UTR6_YEAST C4-C3 -3.03125238 0.06963413 -43.5311293  8
## 13284 sp|P55249|ZRT4_YEAST C4-C3 -0.42860099 0.16454664  -2.6047387  8
## 13290 sp|P55752|ISCB_YEAST C4-C3  6.72094088 0.25597097  26.2566526  8
##             pvalue   adj.pvalue issue MissingPercentage
## 10549 4.815597e-01 9.991155e-01    NA       0.000000000
## 10555 3.635536e-12 5.502384e-09    NA       0.000000000
## 10561 6.711258e-07 5.078744e-04    NA       0.006802721
## 10567 3.197442e-13 9.678658e-10    NA       0.004545455
## 13279 2.223177e-10 2.243186e-07    NA       0.000000000
## 13285 3.669632e-06 2.221595e-03    NA       0.000000000
## 10550 1.172208e-05 8.870685e-03    NA       0.000000000
## 10556 1.061307e-11 1.606288e-08    NA       0.000000000
## 10562 3.448716e-03 3.291921e-01    NA       0.005952381
## 10568 6.576961e-13 1.990846e-09    NA       0.004545455
## 13280 2.255486e-02 4.741219e-01    NA       0.003333333
## 13286 4.291489e-09 4.330112e-06    NA       0.000000000
## 10551 1.510378e-07 1.142978e-04    NA       0.000000000
## 10557 2.304823e-11 6.976699e-08    NA       0.000000000
## 10563 1.060525e-10 1.605105e-07    NA       0.002551020
## 10569 3.425775e-10 3.456607e-07    NA       0.010606061
## 13281 6.285656e-04 3.500760e-01    NA       0.000000000
## 13287 7.403131e-01 9.726249e-01    NA       0.000000000
## 10552 2.118219e-05 1.282370e-02    NA       0.000000000
## 10558 2.398082e-14 7.258993e-11    NA       0.000000000
## 10564 9.400880e-06 7.114116e-03    NA       0.007653061
## 10570 6.490596e-05 3.274506e-02    NA       0.000000000
## 13282 1.268576e-10 1.919990e-07    NA       0.003333333
## 13288 3.110579e-07 3.138574e-04    NA       0.000000000
## 10553 2.138408e-07 1.294592e-04    NA       0.000000000
## 10559 3.419487e-14 1.035079e-10    NA       0.000000000
## 10565 1.094080e-11 1.655891e-08    NA       0.004251701
## 10571 2.382627e-11 2.404071e-08    NA       0.006060606
## 13283 7.803913e-11 5.905611e-08    NA       0.000000000
## 13289 4.644012e-06 2.342904e-03    NA       0.000000000
## 10554 8.043885e-05 6.087210e-02    NA       0.000000000
## 10560 7.862402e-04 3.194668e-01    NA       0.000000000
## 10566 5.081291e-11 1.294836e-07    NA       0.003401361
## 10572 8.555245e-11 1.294836e-07    NA       0.006060606
## 13284 3.138583e-02 7.525620e-01    NA       0.003333333
## 13290 4.756235e-09 4.799042e-06    NA       0.000000000
##       ImputationPercentage
## 10549                    0
## 10555                    0
## 10561                    0
## 10567                    0
## 13279                    0
## 13285                    0
## 10550                    0
## 10556                    0
## 10562                    0
## 10568                    0
## 13280                    0
## 13286                    0
## 10551                    0
## 10557                    0
## 10563                    0
## 10569                    0
## 13281                    0
## 13287                    0
## 10552                    0
## 10558                    0
## 10564                    0
## 10570                    0
## 13282                    0
## 13288                    0
## 10553                    0
## 10559                    0
## 10565                    0
## 10571                    0
## 13283                    0
## 13289                    0
## 10554                    0
## 10560                    0
## 10566                    0
## 10572                    0
## 13284                    0
## 13290                    0
smoothScatter(pc$rotation[,1], Skyline.result.spikein[Skyline.result.spikein$Label == 'C2-C1', 'adj.pvalue'])

smoothScatter(pc$rotation[,1], Skyline.result.spikein[Skyline.result.spikein$Label == 'C3-C1', 'adj.pvalue'])

2.2 All 3025 proteins

2.2.1 PCA with prcomp function

Let’s repeat with all proteins (3025 proteins).

head(input.wide)
##                       JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST                   26.81232                  26.60786
## sp|O13297|CET1_YEAST                   24.71912                  24.67437
## sp|O13329|FOB1_YEAST                   23.37678                  24.01464
## sp|O13539|THP2_YEAST                   27.52021                  27.38510
## sp|O13547|CCW14_YEAST                  27.22234                  26.80454
## sp|O13563|RPN13_YEAST                  26.09476                  26.29253
##                       JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST                   26.58301                  26.83563
## sp|O13297|CET1_YEAST                   24.71809                  24.52341
## sp|O13329|FOB1_YEAST                   23.47075                  23.17165
## sp|O13539|THP2_YEAST                   24.29661                  25.88066
## sp|O13547|CCW14_YEAST                  27.11638                  27.17386
## sp|O13563|RPN13_YEAST                  26.17056                  25.90882
##                       JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST                   26.79430                  26.60863
## sp|O13297|CET1_YEAST                   24.67593                  24.57865
## sp|O13329|FOB1_YEAST                   23.43427                  23.76997
## sp|O13539|THP2_YEAST                   26.10254                  25.90646
## sp|O13547|CCW14_YEAST                  26.91302                  26.88314
## sp|O13563|RPN13_YEAST                  26.12989                  26.01078
##                       JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST                   26.62966                  26.49626
## sp|O13297|CET1_YEAST                   24.38955                  24.62652
## sp|O13329|FOB1_YEAST                   24.20100                  23.28643
## sp|O13539|THP2_YEAST                   25.65002                  26.33853
## sp|O13547|CCW14_YEAST                  26.56416                  26.75541
## sp|O13563|RPN13_YEAST                  26.17791                  26.03753
##                       JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST                   26.53029                  26.60612
## sp|O13297|CET1_YEAST                   24.66197                  24.64886
## sp|O13329|FOB1_YEAST                   23.73741                  23.16646
## sp|O13539|THP2_YEAST                   25.91799                  26.70101
## sp|O13547|CCW14_YEAST                  26.95027                  26.98082
## sp|O13563|RPN13_YEAST                  26.11412                  26.07538
##                       JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST                   26.38611                  26.65573
## sp|O13297|CET1_YEAST                   24.66559                  24.50814
## sp|O13329|FOB1_YEAST                   23.60780                  23.03473
## sp|O13539|THP2_YEAST                   25.91781                  25.07576
## sp|O13547|CCW14_YEAST                  26.80274                  27.07526
## sp|O13563|RPN13_YEAST                  26.05415                  25.77958
dim(input.wide)
## [1] 3025   12
pc <- prcomp(t(input.wide))

# Inspect PCA object
summary(pc)
## Importance of components%s:
##                           PC1    PC2    PC3    PC4     PC5    PC6     PC7
## Standard deviation     8.5615 8.0407 6.7776 6.3000 5.70469 5.0596 4.90438
## Proportion of Variance 0.1973 0.1740 0.1236 0.1068 0.08759 0.0689 0.06474
## Cumulative Proportion  0.1973 0.3713 0.4949 0.6018 0.68935 0.7582 0.82299
##                           PC8     PC9    PC10    PC11      PC12
## Standard deviation     4.6259 4.09531 3.90540 3.51317 5.935e-14
## Proportion of Variance 0.0576 0.04514 0.04105 0.03322 0.000e+00
## Cumulative Proportion  0.8806 0.92573 0.96678 1.00000 1.000e+00
names(pc)
## [1] "sdev"     "rotation" "center"   "scale"    "x"

2.2.2 Check the proportion of explained variance

Let’s check the proportion of explained variance. The first component has the largest variance. In this case, we need 10-11 components to capture most of the variation.

percent_var <- pc$sdev^2/sum(pc$sdev^2)
barplot(percent_var, xlab="Principle component", ylab="% of variance")

cum_var <- cumsum(pc$sdev^2/sum(pc$sdev^2))
barplot(cum_var, xlab="Principle component", ylab="Cumulative % of variance" )

2.1.3 Visualization for PC1 vs PC2

Let’s visualize PC1 vs PC2 in scatterplot. ‘x’ include PC components for each subject.

# 'x' include PC components for each subject.
head(pc$x)
##                                 PC1         PC2        PC3         PC4
## JD_06232014_sample1_B.raw -1.917574   3.4164862 -10.015293  0.03980838
## JD_06232014_sample1_C.raw  5.225472   6.3347577  -4.182085 -0.74346099
## JD_06232014_sample1-A.raw -3.884473   1.6034036 -10.796357 -2.19975140
## JD_06232014_sample2_A.raw  1.197283 -17.3294296  -2.563257  8.54857860
## JD_06232014_sample2_B.raw 10.822842   0.1850169  -1.769981  2.69840938
## JD_06232014_sample2_C.raw 11.145497  -0.8122667   2.961406  8.15190088
##                                   PC5        PC6        PC7         PC8
## JD_06232014_sample1_B.raw -2.13063047  0.1130047 -3.4227747 -0.71295527
## JD_06232014_sample1_C.raw  5.01570041 -5.2743242 -7.2867330  5.12845510
## JD_06232014_sample1-A.raw -0.05232091  2.0300503 -3.2690895 -4.12929066
## JD_06232014_sample2_A.raw  3.57323378  7.7025759 -0.7285404 -0.02193047
## JD_06232014_sample2_B.raw -3.58504139 -6.1833969  5.7342577 -9.05433684
## JD_06232014_sample2_C.raw -1.45746419 -4.0542433  2.4118556  7.20691739
##                                   PC9      PC10      PC11         PC12
## JD_06232014_sample1_B.raw  5.18678879 -1.663498 -8.036395 5.787211e-14
## JD_06232014_sample1_C.raw -3.46516758 -5.352419  3.151916 5.632170e-14
## JD_06232014_sample1-A.raw -2.25243930  8.086692  3.826111 5.719036e-14
## JD_06232014_sample2_A.raw -0.80119110 -3.053138  0.808913 5.869350e-14
## JD_06232014_sample2_B.raw  1.38105058 -3.053447  2.280873 5.735907e-14
## JD_06232014_sample2_C.raw  0.08197253  6.577174 -1.924547 5.572777e-14
ggplot(aes(x=PC1, y=PC2), data=data.frame(pc$x))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

In order to make different colors or shapes by groups, create Create vector of disease labels variable called Condition.

colnames(input.wide)
##  [1] "JD_06232014_sample1_B.raw" "JD_06232014_sample1_C.raw"
##  [3] "JD_06232014_sample1-A.raw" "JD_06232014_sample2_A.raw"
##  [5] "JD_06232014_sample2_B.raw" "JD_06232014_sample2_C.raw"
##  [7] "JD_06232014_sample3_A.raw" "JD_06232014_sample3_B.raw"
##  [9] "JD_06232014_sample3_C.raw" "JD_06232014_sample4_B.raw"
## [11] "JD_06232014_sample4_C.raw" "JD_06232014_sample4-A.raw"
Condition <- c(rep("Condition1", 3), rep("Condition2", 3),
               rep("Condition3", 3), rep("Condition4", 3))
Condition
##  [1] "Condition1" "Condition1" "Condition1" "Condition2" "Condition2"
##  [6] "Condition2" "Condition3" "Condition3" "Condition3" "Condition4"
## [11] "Condition4" "Condition4"
# Create PC1 vs PC2 scatterplot with Condition colors
ggplot(aes(x=PC1, y=PC2, color=Group), data=data.frame(pc$x, Group=Condition))+
  geom_point(size=4, alpha=0.5)+
  theme_bw()

With more number proteins, we need more PC components to reach most of variation.


3. Heatmap

3.1 Only 6 spike-in proteins

3.1.1 matrix format

# check the class
class(input.wide.spike)
## [1] "data.frame"
# It is data.frame. Convert to numeric matrix
input.wide.spike <- as.matrix(input.wide.spike)
class(input.wide.spike)
## [1] "matrix"
# Visually no difference
head(input.wide.spike)
##                      JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST                  26.30163                  26.11643
## sp|P44374|SFG2_YEAST                  24.26559                  24.27609
## sp|P44683|PGA4_YEAST                  22.47239                  22.56104
## sp|P44983|UTR6_YEAST                  21.29173                  21.38781
## sp|P55249|ZRT4_YEAST                  21.86421                  21.41435
## sp|P55752|ISCB_YEAST                  27.47412                  27.69489
##                      JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST                  26.29089                  25.81957
## sp|P44374|SFG2_YEAST                  24.23382                  21.36784
## sp|P44683|PGA4_YEAST                  22.52561                  20.06196
## sp|P44983|UTR6_YEAST                  21.31825                  27.51134
## sp|P55249|ZRT4_YEAST                  21.18725                  27.93246
## sp|P55752|ISCB_YEAST                  27.34912                  24.59934
##                      JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST                  26.11527                  26.08498
## sp|P44374|SFG2_YEAST                  21.20330                  21.19631
## sp|P44683|PGA4_YEAST                  20.18183                  20.72974
## sp|P44983|UTR6_YEAST                  27.46338                  27.33788
## sp|P55249|ZRT4_YEAST                  27.75780                  27.83651
## sp|P55752|ISCB_YEAST                  24.62946                  24.70403
##                      JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST                  23.14806                  23.32465
## sp|P44374|SFG2_YEAST                  26.90000                  26.85056
## sp|P44683|PGA4_YEAST                  21.72482                  21.93912
## sp|P44983|UTR6_YEAST                  26.94886                  26.92480
## sp|P55249|ZRT4_YEAST                  21.04481                  21.00220
## sp|P55752|ISCB_YEAST                  21.37264                  20.27802
##                      JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST                  23.29555                  20.94536
## sp|P44374|SFG2_YEAST                  26.90190                  26.61875
## sp|P44683|PGA4_YEAST                  21.96229                  29.20591
## sp|P44983|UTR6_YEAST                  26.85774                  23.86204
## sp|P55249|ZRT4_YEAST                  21.02761                  20.71864
## sp|P55752|ISCB_YEAST                  20.44112                  27.36941
##                      JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST                  21.71424                  20.25209
## sp|P44374|SFG2_YEAST                  26.60961                  26.69464
## sp|P44683|PGA4_YEAST                  29.12515                  29.21482
## sp|P44983|UTR6_YEAST                  23.76070                  24.01492
## sp|P55249|ZRT4_YEAST                  20.69131                  20.37886
## sp|P55752|ISCB_YEAST                  27.53762                  27.34758

3.1.2 heatmap function in base stats package

First, let’s try to draw heatmap with base function.

heatmap(input.wide.spike, scale='none')

heatmap(input.wide.spike, scale='row')

heatmap(input.wide.spike, scale='column')

# Change the font of row and column label
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2))

# Don't do cluster on rows
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2), Rowv = NA)

# Don't do cluster on columns
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2), Colv = NA)

3.1.3 Color bar for group information

Add color side bar at th top of columns to distinguish group information by run.

group.color <- c(rep("red", 3), rep("orange", 3),
               rep("yellow", 3), rep("blue", 3))
heatmap(input.wide.spike, ColSideColors=group.color)

3.1.4 Color scale

If color scale changed, it is easy to see the difference.

library(marray)
## Loading required package: limma
my.colors <- c(maPalette(low = "darkblue", high = "white", k = 7)[-7],
               "white",maPalette(low = "white", high = "darkred", k = 7)[-1])
heatmap(input.wide.spike, 
        col=my.colors, 
        ColSideColors=group.color,
        scale='none')

3.1.5 Different distance and clustering

Try different distances calculation and clustering methods. Choice of distance metric or clustering matters!

  • Distance options: euclidean (default), maximum, canberra, binary, minkowski, manhattan

  • Cluster options: complete (default), single, average, mcquitty, median, centroid, ward

# can change method for distance calculation
col_distance <- dist(t(input.wide.spike), method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")

heatmap(input.wide.spike, 
        col=my.colors, 
        ColSideColors=group.color,
        Colv = as.dendrogram(col_cluster),
        scale='none')

3.2 All proteins (3025 proteins)

3.2.1 matrix format

# check the class
class(input.wide)
## [1] "data.frame"
# It is data.frame. Convert to numeric matrix
input.wide <- as.matrix(input.wide)
class(input.wide)
## [1] "matrix"
# Visually no difference
head(input.wide)
##                       JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST                   26.81232                  26.60786
## sp|O13297|CET1_YEAST                   24.71912                  24.67437
## sp|O13329|FOB1_YEAST                   23.37678                  24.01464
## sp|O13539|THP2_YEAST                   27.52021                  27.38510
## sp|O13547|CCW14_YEAST                  27.22234                  26.80454
## sp|O13563|RPN13_YEAST                  26.09476                  26.29253
##                       JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST                   26.58301                  26.83563
## sp|O13297|CET1_YEAST                   24.71809                  24.52341
## sp|O13329|FOB1_YEAST                   23.47075                  23.17165
## sp|O13539|THP2_YEAST                   24.29661                  25.88066
## sp|O13547|CCW14_YEAST                  27.11638                  27.17386
## sp|O13563|RPN13_YEAST                  26.17056                  25.90882
##                       JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST                   26.79430                  26.60863
## sp|O13297|CET1_YEAST                   24.67593                  24.57865
## sp|O13329|FOB1_YEAST                   23.43427                  23.76997
## sp|O13539|THP2_YEAST                   26.10254                  25.90646
## sp|O13547|CCW14_YEAST                  26.91302                  26.88314
## sp|O13563|RPN13_YEAST                  26.12989                  26.01078
##                       JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST                   26.62966                  26.49626
## sp|O13297|CET1_YEAST                   24.38955                  24.62652
## sp|O13329|FOB1_YEAST                   24.20100                  23.28643
## sp|O13539|THP2_YEAST                   25.65002                  26.33853
## sp|O13547|CCW14_YEAST                  26.56416                  26.75541
## sp|O13563|RPN13_YEAST                  26.17791                  26.03753
##                       JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST                   26.53029                  26.60612
## sp|O13297|CET1_YEAST                   24.66197                  24.64886
## sp|O13329|FOB1_YEAST                   23.73741                  23.16646
## sp|O13539|THP2_YEAST                   25.91799                  26.70101
## sp|O13547|CCW14_YEAST                  26.95027                  26.98082
## sp|O13563|RPN13_YEAST                  26.11412                  26.07538
##                       JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST                   26.38611                  26.65573
## sp|O13297|CET1_YEAST                   24.66559                  24.50814
## sp|O13329|FOB1_YEAST                   23.60780                  23.03473
## sp|O13539|THP2_YEAST                   25.91781                  25.07576
## sp|O13547|CCW14_YEAST                  26.80274                  27.07526
## sp|O13563|RPN13_YEAST                  26.05415                  25.77958

3.2.3 Color bar and non-scaled data

Without scale in rows, it is hard to see the difference between groups per protein.

group.color <- c(rep("red", 3), rep("orange", 3),
               rep("yellow", 3), rep("blue", 3))

# remove row information (Protein IDs)
heatmap(input.wide, 
        ColSideColors=group.color,
        col=my.colors,
        labRow = FALSE,
        scale = 'none')

Protein quantification will be centered and scaled in the row direction.

# no Protein IDs
heatmap(input.wide, 
        ColSideColors=group.color,
        col=my.colors,
        labRow = FALSE,
        scale = 'row')

If we change distance calculation and/or clustering method, dendrogram is different. In this case, clustering between methods are not much different.

# can change method for distance calculation
col_distance <- dist(t(input.wide), method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")

heatmap(input.wide, 
        col=my.colors, 
        ColSideColors=group.color,
        Colv = as.dendrogram(col_cluster),
        labRow = FALSE,
        scale = 'row')


4. Biological data

4.1 Read subject quantification per protein and handle missing values.

In this section we’ll continue using mpg and CRC dataset.

crc <- read.csv("./data_day3/CRC_train.csv")
head(crc)
##      A1AG2      AFM     AHSG AIAG.Bovine     ANT3      AOC3     APOB
## 1 14.23816 16.10302 19.95179    15.25354 17.20794 10.033227 15.54477
## 2 15.02411 16.02071 19.71592    15.15455 17.29790  9.035202 15.13188
## 3 15.63136 16.14380 19.71085    15.59163 17.59625 10.382938 15.95530
## 4 15.40137 16.27642 19.70438    15.11819 17.42250  9.504018 15.71493
## 5 16.00316 16.95821 20.42033    15.58249 17.98820  9.651676 16.24733
## 6 13.93242 16.52772 19.88985    13.37131 16.32493  9.521379 14.51862
##       ATRN      BTD   C20orf3     CADM1    CD163      CD44     CDH5
## 1 14.38339 16.28307 10.660954  9.743511 10.94965  9.943858 8.749720
## 2 13.98172 16.24919 10.702064  9.702175 10.83449 10.425750 9.056710
## 3 14.63536 16.49916 11.188267 10.373824 11.22385 11.026696 9.477187
## 4 14.06070 16.27773  9.966157 10.105507 11.17723 11.103752 9.866905
## 5 14.20359 16.54142 12.574731  9.690320 12.23111 11.166600 8.522353
## 6 13.70165 15.79196 10.578714  9.190440 12.21289  8.038643 9.225166
##        CFH      CFI      CLU       CP      CTSD DKFZp686N02209     DSG2
## 1 17.26186 16.52607 19.32642 16.99737  9.275675       20.70295 10.22517
## 2 17.26762 16.86256 19.40444 16.22411 10.865768       21.86027 10.04611
## 3 17.98868 17.20551 19.53746 17.75641 10.029862       21.84249 11.11479
## 4 18.17423 17.52567 19.38313 18.05366 10.203826       20.70488 10.95813
## 5 18.82220 18.47344 20.19405 16.23669 10.941220       21.18504 10.87991
## 6 16.13991 16.94726 18.90905 16.34736 11.858181       21.77231 10.23335
##        ECM1      F11       F5    FCGBP FETUA.Bovine    FETUB       FGA
## 1 13.113391 14.81509 12.23214 12.38189     17.04865 13.29704  8.144845
## 2 12.569733 14.71162 12.14809 11.68763     17.07832 13.50235  9.305395
## 3 13.661017 15.56122 12.80789 12.40291     17.12503 13.55688  9.432872
## 4 13.545199 15.95389 13.18926 13.03992     17.06354 13.12742  9.746036
## 5 13.371014 15.98843 12.54162 13.23842     17.02016 14.95305 10.066428
## 6  9.915183 14.69724 11.62907 12.69986     17.08611 13.19051        NA
##        FGG     FHR3      FN1    GOLM1       HP      HRG    HYOU1    ICAM1
## 1 11.34954 11.52154 12.55169 8.529159 19.74767 17.47524 7.504040 9.090639
## 2 12.19192 10.73714 13.47176       NA 20.12543 17.95990 9.764873 9.253566
## 3 11.49952 11.56901 12.36536       NA 20.78502 17.57534 8.646144 9.369628
## 4 11.87189 13.40320 12.17770       NA 21.87358 17.38916 8.868061 9.760900
## 5 11.59058 12.56019 14.10296 9.749398 22.79789 17.95183 8.115907 9.887392
## 6 10.68147 10.05570 11.81432 9.913008 20.71284 16.76395 8.842497 9.027833
##      IGFBP3    IGHA2    IGHG2    ITIH4    KLKB1     KNG1    LAMP2
## 1 10.918418 18.90573 22.10597 15.64604 14.12489 17.65508 13.68160
## 2 10.357056 21.04688 22.44804 15.36709 14.19172 16.98601 13.48692
## 3 11.619532 18.87877 22.95173 16.40141 14.09215 17.39206 14.12212
## 4 11.425023 20.89932 22.93422 16.18817 13.48183 17.82389 14.41111
## 5 10.175458 21.26317 21.26145 15.87709 15.19997 17.95218 15.17730
## 6  9.527139 21.92661 21.00712 14.94386 13.20866 17.39358 13.94826
##        LCN2 LGALS3BP     LRG1      LUM    LYVE1    MMRN1       MPO
## 1  9.446541 14.82147 13.96127 14.99193 11.86759 11.39385  9.729238
## 2 10.623488 15.12812 14.08645 15.41582 11.71390 11.15791 10.202602
## 3 10.024225 14.20939 15.05311 15.06369 12.19818 11.73116  9.782391
## 4 10.109405 14.87094 14.92352 14.60999 11.77828 11.95025 10.761889
## 5 11.052435 16.55743 16.46953 15.93393 12.92101 12.33475 11.727629
## 6  9.355121 16.28340 14.49637 14.47633 12.45132 11.68845  9.834259
##       MRC2     MST1     NCAM1     ORM1     PGCP      PIGR     PLTP
## 1 10.86842 12.96116  9.905815 16.74179 8.379739  9.744116 11.85560
## 2 10.01434 13.09922 10.496499 17.24994 8.406487  9.422932 11.54273
## 3 10.56038 14.11266 10.521723 18.41524 8.243928  9.306936 12.15338
## 4 10.02103 13.92377 10.385096 18.09024 9.114961  9.417459 11.19101
## 5 11.41932 13.10718 10.517894 18.28100 8.778907 11.389195 12.12600
## 6 10.91711 13.37761  9.854809 17.02724 8.368942  9.466000 11.35824
##      PLXDC2     PON1     PRG4      PROC     PTPRJ   Q5JNX2 SERPINA1
## 1 10.050261 16.87047 11.84463 11.109446  9.758946 19.02813 18.06109
## 2  9.974399 16.55428 12.49583 11.111383  9.934520 19.86254 17.61019
## 3  9.812975 16.34508 11.70096 11.533164 10.328319 19.50177 18.53997
## 4  9.847176 15.46083 11.09706 11.693945  9.741059 19.98466 17.40141
## 5 10.414638 16.46015 14.46691 10.699745 10.568064 20.46745 17.44801
## 6  8.387396 14.47485 11.85023  9.470798  9.922271 18.14204 17.95217
##   SERPINA3 SERPINA6 SERPINA7    THBS1    TIMP1       TNC      VTN      VWF
## 1 14.21804 15.79685 13.58185 13.99569 11.57875 10.275261 12.06683 10.66103
## 2 14.83233 15.81540 13.13845 13.70771 11.96097 10.234434 12.83689 10.77843
## 3 15.30873 16.40177 13.74233 15.63696 12.15584 10.241626 12.31163 11.18527
## 4 14.25424 16.31049 13.93689 15.51727 12.37927  9.269195 10.67764 10.88786
## 5 16.23191 16.48583 14.16713 15.12911 12.16912 10.476927 14.50459 11.25418
## 6 12.60729 15.29659 13.23403 15.39790 12.52729 10.026299 11.52916 11.03501
##   Sample Group Age Gender Cancer_stage Tumour_location Sub_group
## 1  P1A10   CRC  60 female            1           colon       CRC
## 2   P1A2   CRC  70   male            1          rectum       CRC
## 3   P1A4   CRC  65   male            1          rectum       CRC
## 4   P1A6   CRC  65 female            4           colon       CRC
## 5  P1B12   CRC  62 female            3           colon       CRC
## 6   P1B2   CRC  55   male            2           colon       CRC
colnames(crc)
##  [1] "A1AG2"           "AFM"             "AHSG"           
##  [4] "AIAG.Bovine"     "ANT3"            "AOC3"           
##  [7] "APOB"            "ATRN"            "BTD"            
## [10] "C20orf3"         "CADM1"           "CD163"          
## [13] "CD44"            "CDH5"            "CFH"            
## [16] "CFI"             "CLU"             "CP"             
## [19] "CTSD"            "DKFZp686N02209"  "DSG2"           
## [22] "ECM1"            "F11"             "F5"             
## [25] "FCGBP"           "FETUA.Bovine"    "FETUB"          
## [28] "FGA"             "FGG"             "FHR3"           
## [31] "FN1"             "GOLM1"           "HP"             
## [34] "HRG"             "HYOU1"           "ICAM1"          
## [37] "IGFBP3"          "IGHA2"           "IGHG2"          
## [40] "ITIH4"           "KLKB1"           "KNG1"           
## [43] "LAMP2"           "LCN2"            "LGALS3BP"       
## [46] "LRG1"            "LUM"             "LYVE1"          
## [49] "MMRN1"           "MPO"             "MRC2"           
## [52] "MST1"            "NCAM1"           "ORM1"           
## [55] "PGCP"            "PIGR"            "PLTP"           
## [58] "PLXDC2"          "PON1"            "PRG4"           
## [61] "PROC"            "PTPRJ"           "Q5JNX2"         
## [64] "SERPINA1"        "SERPINA3"        "SERPINA6"       
## [67] "SERPINA7"        "THBS1"           "TIMP1"          
## [70] "TNC"             "VTN"             "VWF"            
## [73] "Sample"          "Group"           "Age"            
## [76] "Gender"          "Cancer_stage"    "Tumour_location"
## [79] "Sub_group"
dim(crc)
## [1] 200  79
crc.prot <- crc[,1:72]
crc.anno <- crc[,73:79]

## check missing value
sum(is.na(crc.prot))
## [1] 215
# Deal with missing value
# First option: remove the samples with missing values
dim(na.omit(crc.prot))
## [1] 89 72
# Second option: impute the missing values
random.imp <- function (a){
  missing <- is.na(a)
  n.missing <- sum(missing)
  a.obs <- a[!missing]
  imputed <- a
  # imputed[missing] <- 0
  # imputed[missing] <- sample(a.obs, n.missing, replace=TRUE)
  # imputed[missing] <- median(a.obs)
  imputed[missing] <- min(a.obs)
  return (imputed)
}

pMiss <- function(x){
    sum(is.na(x))/length(x)*100
}

# 1. Only keep the proteins with less than 5% missing values
proteinmissing <- apply(crc.prot, 2, pMiss)
selectedprotein <- which(proteinmissing <= 5) 
crc.prot <- crc.prot[, selectedprotein]

# 2. Only keep the samples with less than 5% missing values
samplemissing <- apply(crc.prot, 1, pMiss)
selectedSamples <- which(samplemissing <= 5) 
# Then, impute the missing value
imputed.crc.prot <- t(apply(crc.prot[selectedSamples,], 1, function(x) random.imp(x)))
imputed.crc.anno <- crc.anno[selectedSamples, ]
dim(imputed.crc.anno)
## [1] 200   7

4.2 Heatmap of individual values of proteins

4.2.1 Default heatmap

library(genefilter)
## heatmap function in base stats package
heatmap(t(imputed.crc.prot))

4.2.2 Change color

# Non-scaled 
heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA, scale='none')

# scaled
heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA)

heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA,
        col=my.colors)

4.2.3 Color bar for group information

unique(imputed.crc.anno$Sub_group)
## [1] CRC     Benign  Healthy
## Levels: Benign CRC Healthy
myColor <- rep("blue", nrow(imputed.crc.prot))
myColor[imputed.crc.anno$Sub_group == "CRC"] <- "red" 
myColor[imputed.crc.anno$Sub_group == "Benign"] <- "yellow" 

4.2.4 Standardized in a row (a protein)

quantile-spaced breaks in positive and negative directions

# change color code: protein-standardized expressions
# quantile-spaced breaks in positive and negative directions
imputed_CRC_prot_centerScale <-  ( t(imputed.crc.prot) - rowMeans(t(imputed.crc.prot)) ) / rowSds(t(imputed.crc.prot))

exprs_brk <- c( quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale < 0], probs=seq(0, 1, length=10)), 0, quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale > 0], seq(0, 1, length=10)))
colors <- colorRampPalette(c("darkblue", "white", "darkred"))(length(exprs_brk)-1)

heatmap(t(imputed.crc.prot), 
        cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Rowv = NA,
        col = colors, breaks = exprs_brk, ColSideColors=myColor)

4.2.5 Different distance and clustering

# can change method for distance calculation
col_distance <- dist(imputed.crc.prot, method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")

heatmap(t(imputed.crc.prot),
        cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), 
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors = myColor,
        Rowv = NA, 
        Colv = as.dendrogram(col_cluster))